Introduction

For our final group project, we wanted to help a private wine trader from Chile to enter the US market. He has no prior business knowledge of the US market and hence we aimed to answer some key questions with the help of different data sets and visualizations.

Some of the key topics we address is:

While we do not exhaust all possible factors of interest, we generate some ideas and starting poitns for further hypothesis testing for a more detailed business plan.

For our analysis we used the following data sources:

Create own graph template

loadfonts(device="pdf")

custom_layout <-   theme_minimal() + theme(
     
    text=element_text(family = "PT Sans"),
    plot.title = element_text(face="bold", size = rel(1.7)), 
    plot.subtitle= element_text(color="grey60", size = rel(1.6)),
    panel.background = element_blank(),
    plot.caption = element_text(hjust = 0, size = rel(1.2), face="italic", color="grey60"), 
    plot.title.position = "plot", 
    plot.caption.position =  "plot", 
    axis.title = element_text(color="grey50", size = rel(1.4)), 
    axis.ticks= element_line(color="grey50", size = rel(1.4)), 
    axis.line = element_line(size = 1, colour = "grey80"), 
    axis.text =element_text(color="grey50", size = rel(1)),
    legend.title = element_text(family = "PT Sans",face = "bold", rel(1.4)))

Economic Indicator Analysis

Loading and cleaning economic indicator data

Before we can start with the analysis, we load and clean the data.

# Data Source: https://apps.bea.gov/regional/downloadzip.cfm
gdp <- read_csv(here::here("data", "SAGDP1__ALL_AREAS_1997_2020.csv"))
pce <- read_csv(here::here("data", "SAPCE4__ALL_AREAS_1997_2020.csv"))


states <- read_csv(here::here("data", "states.csv"))
state_pop <- read_csv(here::here("data", "2019_Census_US_Population_Data_By_State_Lat_Long.csv"))

#GDP reported in million
glimpse(gdp)
## Rows: 484
## Columns: 32
## Warning: One or more parsing issues, see `problems()` for details
## $ GeoFIPS                <chr> "00000", "00000", "00000", "00000", "00000", "0…
## $ GeoName                <chr> "United States", "United States", "United State…
## $ Region                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 5, 5, …
## $ TableName              <chr> "SAGDP1", "SAGDP1", "SAGDP1", "SAGDP1", "SAGDP1…
## $ LineCode               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,…
## $ IndustryClassification <chr> "...", "...", "...", "...", "...", "...", "..."…
## $ Description            <chr> "Real GDP (millions of chained 2012 dollars)", …
## $ Unit                   <chr> "Millions of chained 2012 dollars", "Quantity i…
## $ `1997`                 <dbl> 11529157.000, 70.931, 8577552.000, 4713220.000,…
## $ `1998`                 <dbl> 12045824.000, 74.110, 9062817.000, 5075701.000,…
## $ `1999`                 <dbl> 12623361.000, 77.663, 9631172.000, 5409937.000,…
## $ `2000`                 <dbl> 13138035.000, 80.830, 10250952.000, 5854634.000…
## $ `2001`                 <dbl> 13263417.000, 81.601, 10581929.000, 6046346.000…
## $ `2002`                 <dbl> 13488357.000, 82.985, 10929108.000, 6143370.000…
## $ `2003`                 <dbl> 13865519.000, 85.305, 11456450.000, 6362298.000…
## $ `2004`                 <dbl> 14399696.000, 88.592, 12217196.000, 6729306.000…
## $ `2005`                 <dbl> 14901269.000, 91.678, 13039197.000, 7077722.000…
## $ `2006`                 <dbl> 15315943.000, 94.229, 13815583.000, 7491260.000…
## $ `2007`                 <dbl> 15623871.000, 96.123, 14474228.000, 7889371.000…
## $ `2008`                 <dbl> 15642962.000, 96.241, 14769862.000, 8068682.000…
## $ `2009`                 <dbl> 15236262.000, 93.739, 14478067.000, 7767191.000…
## $ `2010`                 <dbl> 15648991.000, 96.278, 15048970.000, 7932970.000…
## $ `2011`                 <dbl> 15891534.000, 97.770, 15599731.000, 8234017.000…
## $ `2012`                 <dbl> 16253970.0, 100.0, 16253970.0, 8575362.0, 66005…
## $ `2013`                 <dbl> 16553348.000, 101.842, 16843196.000, 8843637.00…
## $ `2014`                 <dbl> 16932051.000, 104.172, 17550687.000, 9259654.00…
## $ `2015`                 <dbl> 17390295.000, 106.991, 18206023.000, 9709535.00…
## $ `2016`                 <dbl> 17680274.000, 108.775, 18695106.000, 9977096.00…
## $ `2017`                 <dbl> 18079084.000, 111.229, 19479623.000, 10436745.0…
## $ `2018`                 <dbl> 18606787.000, 114.475, 20527159.000, 10969807.0…
## $ `2019`                 <dbl> 19032672.000, 117.096, 21372582.000, 11459449.0…
## $ `2020`                 <dbl> 18384687.000, 113.109, 20893746.000, 11580088.0…
glimpse(pce)
## Rows: 7,985
## Columns: 32
## Warning: One or more parsing issues, see `problems()` for details
## $ GeoFIPS                <chr> "00000", "00000", "00000", "00000", "00000", "0…
## $ GeoName                <chr> "United States", "United States", "United State…
## $ Region                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ TableName              <chr> "SAPCE4", "SAPCE4", "SAPCE4", "SAPCE4", "SAPCE4…
## $ LineCode               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ IndustryClassification <chr> "...", "...", "...", "...", "...", "...", "..."…
## $ Description            <chr> "Personal consumption expenditures", "Household…
## $ Unit                   <chr> "Millions of current dollars", "Millions of cur…
## $ `1997`                 <dbl> 5536790, 5431202, 474777, 412764, 61598, 414, 2…
## $ `1998`                 <dbl> 5877248, 5753441, 487437, 422087, 64968, 382, 2…
## $ `1999`                 <dbl> 6283758, 6145529, 515530, 444945, 70235, 350, 2…
## $ `2000`                 <dbl> 6767179, 6609139, 540578, 463125, 77136, 318, 2…
## $ `2001`                 <dbl> 7073801, 6894718, 564003, 482228, 81489, 286, 2…
## $ `2002`                 <dbl> 7348941, 7150598, 575052, 490397, 84374, 281, 2…
## $ `2003`                 <dbl> 7740749, 7535238, 599580, 513578, 85728, 275, 3…
## $ `2004`                 <dbl> 8231960, 8025589, 632604, 542995, 89269, 340, 3…
## $ `2005`                 <dbl> 8769066, 8558799, 668216, 575282, 92551, 384, 3…
## $ `2006`                 <dbl> 9277236, 9038051, 700260, 601564, 98231, 465, 3…
## $ `2007`                 <dbl> 9746594, 9497813, 737332, 634713, 102215, 403, …
## $ `2008`                 <dbl> 10050083, 9762791, 769085, 665819, 102877, 390,…
## $ `2009`                 <dbl> 9891218, 9602038, 772930, 669151, 103434, 345, …
## $ `2010`                 <dbl> 10260256, 9965818, 786866, 678586, 107895, 386,…
## $ `2011`                 <dbl> 10698857, 10387001, 819542, 709051, 110123, 368…
## $ `2012`                 <dbl> 11047363, 10705817, 846198, 731642, 114172, 384…
## $ `2013`                 <dbl> 11363528, 11010938, 863994, 748277, 115227, 489…
## $ `2014`                 <dbl> 11847725, 11482129, 896855, 776231, 120037, 588…
## $ `2015`                 <dbl> 12263476, 11891892, 920955, 794884, 125449, 622…
## $ `2016`                 <dbl> 12693266, 12291833, 940635, 809680, 130457, 498…
## $ `2017`                 <dbl> 13239111, 12821163, 973072, 837665, 134975, 433…
## $ `2018`                 <dbl> 13913531, 13468865, 1000345, 859057, 140852, 43…
## $ `2019`                 <dbl> 14428676, 13988786, 1030911, 884334, 146090, 48…
## $ `2020`                 <chr> "14047565", "13526689", "1146676", "979256", "1…
glimpse(states)
## Rows: 51
## Columns: 3
## $ state     <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "C…
## $ latitude  <dbl> 32.7794, 64.0685, 34.2744, 34.8938, 37.1841, 38.9972, 41.621…
## $ longitude <dbl> -86.8287, -152.2782, -111.6602, -92.4426, -119.4696, -105.54…
glimpse(state_pop)
## Rows: 51
## Columns: 4
## $ STATE           <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi…
## $ POPESTIMATE2019 <dbl> 4903185, 731545, 7278717, 3017804, 39512223, 5758736, …
## $ lat             <dbl> 32.37772, 58.30160, 33.44814, 34.74661, 38.57667, 39.7…
## $ long            <dbl> -86.30057, -134.42021, -112.09696, -92.28899, -121.493…
#Definitions of regions according to BEA
#https://apps.bea.gov/regional/docs/msalist.cfm?mlist=2
new_england <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
mideast <- c("Delaware", "District of Columbia", "Maryland", "New Jersey", "New York", "Pennsylvania")
great_lakes <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
plains <- c("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
south_east <- c("Alabama", "Arkansas", "Florida", "Georgia", "Kentucky", "Louisiana", "Mississippi", "North Carolina", "South Carolina", "Tennessee", "Virginia", "West Virginia")
southwest <- c("Arizona", "New Mexico", "Oklahoma", "Texas")
rocky_mountain <- c("Colorado", "Idaho", "Montana", "Utah", "Wyoming")
far_west <- c("Alaska", "California", "Hawaii", "Nevada", "Oregon", "Washington")

First we need to clean the data and toransform it into data frames we can use for our analysis.

gdp_clean <- gdp %>% 
  
  #Clean names
  janitor::clean_names() %>% 
  
  #Filter out row type we need 
  filter(description=="Real GDP (millions of chained 2012 dollars)") %>% 
  
  #Pivot longer to get right data format
  pivot_longer(cols = starts_with("x"), 
               names_to = "year", 
               values_to  = "GDP") %>%  
  
  #Delete first letter of year 
  mutate(year=substring(year, 2), year=lubridate::year(as.Date(year, format="%Y")), 
         #Convert GDP into billion 
         GDP=GDP/1000)

#Divide data into state and region level 
gdp_states <- gdp_clean %>%  filter(!geo_name %in% c("United States", "Far West", "Rocky Mountain", "Southwest","Southeast", "Plains", "Great Lakes", "Mideast", "New England" ))

gdp_regions  <- gdp_clean %>%  filter(geo_name %in% c("United States", "Far West", "Rocky Mountain", "Southwest","Southeast", "Plains", "Great Lakes", "Mideast", "New England" ))

#Join gdp_states with state latitute/longlitude data 
gdp_states <- gdp_states %>%  left_join(states, by=c("geo_name" = "state"))
gdp_states_2020 <- gdp_states %>%  filter(year==2020) %>%  filter(!geo_name %in% c("Alaska", "Hawaii"))
gdp_states_sf <- st_as_sf(gdp_states_2020, 
                          coords=c("longitude", "latitude"), 
                          crs=4326) 


gdp_states_2020_temp <- gdp_states_2020 %>% select(geo_name, GDP) 
#glimpse(gdp_states_sf)

#Create dataframe with GDP per capita 
gdp_perCapita_states_2020 <- gdp_states_2020 %>% left_join(state_pop, by=c("geo_name"="STATE")) %>% 
  select(!lat, !long) %>% 
  mutate(GDP_pc=(GDP/POPESTIMATE2019)*1000000000) %>%  select(geo_name, GDP_pc)

#GDP per capita for regions
region_pop <- state_pop %>%  select(STATE, POPESTIMATE2019) %>%  
  mutate(region=case_when(
    STATE %in% new_england ~  "New England", 
    STATE %in% mideast ~  "Mideast",
    STATE %in% great_lakes ~  "Great Lakes",
    STATE %in% plains ~  "Plains",
    STATE %in% south_east ~  "Southeast",
    STATE %in% southwest ~  "Southwest",
    STATE %in% rocky_mountain ~  "Rocky Mountain",
    STATE %in% far_west ~  "Far West"
    )) %>%  group_by(region) %>%  summarise(region_pop=sum(POPESTIMATE2019))


#Simplification assumption that population is constant 
gdp_pc_regions <- gdp_regions %>%  
  filter(geo_name != "United States") %>%  
  left_join(region_pop, by=c("geo_name" = "region")) %>%  
  mutate(gdp_pc = GDP/region_pop *1000000000)

Cleaning PCE data

#glimpse(pce)
pce_clean <- pce %>%
  
  #Clean names
  janitor::clean_names() %>% 
  
  mutate(x2020 = as.numeric(x2020)) %>%  
   
    #Pivot longer to get right data format
  pivot_longer(cols = 9:32, 
               names_to = "year", 
               values_to  = "Expenditure")  %>% 
  mutate(year=substring(year, 2), year=lubridate::year(as.Date(year, format="%Y")))

pce_total_annual <- pce_clean %>% 
  filter(description=="Personal consumption expenditures", geo_name =="United States") %>% group_by(year) %>% 
  #filter(!is.na(Expenditure)) %>%  
  mutate(population=(sum(region_pop$region_pop)), exp_percapita=Expenditure/population *1000000)


pce_pc_short <- pce_total_annual %>% select(year, exp_percapita)

GDP and PCE over time

First, we want to get a feeling of how GDP and PCE (Personal Consumption Expenditure) is developing over time in the US to get a better feeling of the economic situation in the US.

#Join GDP, PCE data 
eco_ind_us <- gdp_regions %>% 
  filter(geo_name == "United States") %>% 
  mutate(population=(sum(region_pop$region_pop)), gdp_pc=GDP/population*1000000000) %>% select(year, gdp_pc) %>%
  #left_join(Inc_pc_us_short, by="year") %>%
  left_join(pce_pc_short, by="year") %>% 
  pivot_longer(cols=2:3, names_to="indicator", values_to="value")

#Plot graphs with both indicators 
  ggplot(eco_ind_us, aes(x=year, y=value, group=indicator, color=indicator)) + 
    
  #Create gray background for crisis
  geom_rect(aes(xmin=2008,xmax=2009.5),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_rect(aes(xmin=2019.5,xmax=2021),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +  geom_rect(aes(xmin=2000,xmax=2002),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  
    geom_line(size=1.5) +
    scale_color_manual(values = c("#E6ADE6", "#024B7C"), 
                       labels=c("PCE", "GDP"))+
  custom_layout + 
   labs(title="Economic growth is slowed down during global economic crisis",
       subtitle = "GDP and PCE development over time in USA", 
       x="Time",
       y="GDP and PCE per capita in $", 
       caption= "Source: Bureau of Economic Analysis GDP and PCE Data 1997-2020", 
       color="Economic indicator") +
    
  # add the text label on the graph
  geom_text(
    data = data.frame(x = 2008.75, y = 59000, label = "Financial Crisis"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE) +
    
     geom_text(
    data = data.frame(x = 2020.2, y = 59000, label = "Covid-19"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)+
  
     geom_text(
    data = data.frame(x = 2001, y = 59000, label = "Dot-com bubble"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)

Alcohol beverage consumption expenditure

To evaluate if wine is a good industry to go into, we use the PCE data to get a first hypothesis. We calculate the proportion of alcohol spending of total consumer spending over time to get a first indication for this.

#PROPORTIONAL GROWTH OF ALCHOLIC BEVERAGES FROM TOTAL EXPENDITURE 
pce_alc <- pce_clean %>%  filter(description %in% c("Personal consumption expenditures","Alcoholic beverages purchased for off-premises consumption")) %>% 
  mutate(description = case_when(description == "Personal consumption expenditures" ~ "PCE",
                               description == "Alcoholic beverages purchased for off-premises consumption" ~ "Alc_beverages"))

pce_alc <- pce_alc %>% select(geo_name, year, description, Expenditure) %>% 
  group_by(geo_name, year) %>% 
  pivot_wider(names_from="description", values_from="Expenditure") %>%  
  mutate(Alc_pct=Alc_beverages/PCE*100)


#GRAPH ALCOHOL SPENDING AS % OF TOTAL SPENDING 
pce_alc %>%  filter(geo_name=="United States") %>%  
  ggplot(aes(x=year, y=Alc_pct)) +
    geom_rect(aes(xmin=2008,xmax=2009.5),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_rect(aes(xmin=2019.5,xmax=2021),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_rect(aes(xmin=2000,xmax=2002),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_point(size=2, color="#5C0303") + 
  geom_line(size=1.2, color="#5C0303") + 
  custom_layout + 
  ylim(c(0, 1.3)) +
   labs(title="Alcohol spending increases in times of crisis",
       subtitle = "Proportion of alcohol beverage spending of total Consumption Expenditures over time in USA", 
       x="Time",
       y="Alcoholic beverage spending (in %)", 
       caption= "Source: Bureau of Economic Analysis Personal Consumption Expenditures Data 1997-2020") +
  
    # add the text label on the graph
  geom_text(
    data = data.frame(x = 2008.75, y = 1.25, label = "Financial Crisis"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE) +
    
     geom_text(
    data = data.frame(x = 2020.2, y = 1.25, label = "Covid-19"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE) +
  
     geom_text(
    data = data.frame(x = 2001, y = 1.25, label = "Dot-com bubble"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)

us_gdp <- gdp_regions %>% 
  filter(geo_name == "United States") %>%  
  select(year, GDP) %>%  
  mutate(prev_year_GDP=dplyr::lag(GDP,1), growth_annual_GDP=(GDP-prev_year_GDP)/prev_year_GDP*100)


alc_prop_of_pce <- pce_alc %>%  
  filter(geo_name=="United States", !is.na(Alc_pct)) %>%  
  select(geo_name,year, Alc_pct) %>% ungroup() %>% 
  mutate(prev_year_alc_prop=dplyr::lag(Alc_pct,1), growth_annual_alc_pct=(Alc_pct-prev_year_alc_prop)/prev_year_alc_prop*100)

us_gdp_alcohol_pct <-  us_gdp %>% left_join(alc_prop_of_pce, by="year") %>% filter(year!=1997) %>%  select(year, growth_annual_GDP, growth_annual_alc_pct) %>% pivot_longer(2:3, values_to="growth_rate", names_to="type")

ggplot(us_gdp_alcohol_pct, aes(x=year, y=growth_rate, group=type, color=type)) + 
  
  geom_rect(aes(xmin=2008,xmax=2009.5),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_rect(aes(xmin=2019.5,xmax=2021),
            ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_rect(aes(xmin=2000,xmax=2002),
           ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
  geom_line(size=1.2) +
  scale_color_manual(values = c("#8C1414", "#024B7C"), 
                       labels=c("Alcohol spending", "GDP")) + 
  geom_hline(yintercept=0, linetype="dashed", color = "black") +
  
  custom_layout + 
  labs(title="Proportion of alcohol spending increases in times of crisis",
       subtitle = "Proportion of alcohol beverage spending and GDP growth over time in USA", 
       x="Time",
       y="Annual Growth",
       caption="Source: Bureau of Economic Analysis GDP & Personal Consumption Expenditures Data 1997-2020", 
       color= "Annual growth")+
  
    # add the text label on the graph
  geom_text(
    data = data.frame(x = 2008.75, y = 18, label = "Financial Crisis"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE) +
    
     geom_text(
    data = data.frame(x = 2020.2, y = 18, label = "Covid-19"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE) +
  
     geom_text(
    data = data.frame(x = 2001, y = 18, label = "Dot-com bubble"),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)

GDP and GDP growth per US region

To go a little deeper, we want to analyze GDP growth on region. It becomes visible that different regions have not only different GDP and different GDP per capita but also the average of annual growth over the last 20 years significantly differs, which should be taken into consideration when deciding where to enter the market.

# GDP over time by region 
gdp_regions %>%  filter(geo_name != "United States") %>% 
  ggplot(aes(x=year, y=GDP, group=geo_name)) + 
  geom_line(aes(color=geo_name)) +
  custom_layout + 
   labs(title="Southeast has the highest GDP",
       subtitle = "GDP development over time by region", 
       x="Time",
       y="GDP in $B", 
       caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020", 
       color= "Regions")

# GDP per capita over time by region 
gdp_pc_regions %>%
  ggplot(aes(x=year, y=gdp_pc, group=geo_name)) + 
  geom_line(aes(color=geo_name)) +
  custom_layout + 
   labs(title="Far West, Mideast and New England have the highest GDP per capita",
       subtitle = "GDP per capita development over time by region", 
       x="Time",
       y="GDP per capita in $", 
       caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020", 
       color= "Regions")

# GDP per capita increase from last for first for all regions  
gdp_pc_regions_growth <- gdp_pc_regions %>% 
  filter(year %in% c(2020, 1997)) %>%  
  group_by(region) %>% 
  
  #pivot wider to calculate change over years 
  pivot_wider(names_from=year, values_from=c(gdp_pc, GDP)) %>%  
  select(geo_name, gdp_pc_1997, gdp_pc_2020) %>%
  
  #calculate cagr and av. annual growth 
  mutate(cagr=(gdp_pc_2020/gdp_pc_1997)^(1/(2020-1997)-1), average_annual_growth=((gdp_pc_2020-gdp_pc_1997)/gdp_pc_1997)/(2020-1997)) %>%  
  mutate(blue=ifelse(geo_name %in% c("Southwest","Rocky Mountain", "Far West"), TRUE, FALSE))
  
#Plot findings from abovefct_reorder(country, beer_servings)
my_colours <- c("grey70", "#093172")
label <- "Growth mainly driven by Tech and Oil industry growth\n over the two decades"

gdp_pc_regions_growth %>%  ggplot(aes(x=fct_reorder(geo_name, desc(average_annual_growth)), y=average_annual_growth, fill=blue))+ 
  geom_col() +
  scale_y_continuous(labels = scales::percent)+
  scale_fill_manual(values = my_colours)+
   custom_layout + 
  theme(
    axis.title.y = element_blank(), 
    legend.position = "none" 
  )+
   labs(title="The West experienced higher GDP per capita growth over the last 20 years",
       subtitle = "GDP per capita development over time by region", 
       x="Regions",
       y="Annual GDP per capita growth rate (1997-2020)", 
       caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020")+ 
  
  #add arrow to graph
   geom_curve(
    data = data.frame(x = 6, y = 0.03, xend = 3.7, yend = 0.038),
    mapping = aes(x = x, y = y, xend = xend, yend = yend),
    colour = "grey15",
    size = 0.5,
    curvature = 0.25,
    arrow = arrow(length = unit(2, "mm"), type = "closed"),
    inherit.aes = FALSE ) + 
  
    # add the text label on the graph
  geom_text(
    data = data.frame(x = 6, y = 0.028, label = label),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)

US map with GDP and GDP per capita

Next we plot GDP and GDP per capita on a US map based on states. With this, differences between states become easily visible. Moreover, it is interesting to see how misleading GDP can be as an indication for average wealth when the population size is not considered, given that the two maps look very different from each other.

#GDP MAP OF US (total GDP --> economy size)
states <- read_sf(here::here("data", "cb_2018_us_state_500k.shp")) %>% 
  filter(!NAME %in% c("Guam", "Hawaii", "Commonwealth of the Northern Mariana Islands", "American Samoa", "United States Virgin Islands", "Alaska", "Puerto Rico"))

states_gdp <- states %>%  left_join(gdp_states_2020_temp, by=c("NAME" = "geo_name"))

us_gdp_map <- ggplot() + 
  geom_sf(data=states_gdp, aes(fill= GDP)) + 
  custom_layout + 
  scale_fill_gradient(low = "#CEE9F7", high = "#0032BF") +
  coord_sf(datum = NA) +
   labs(title="California has the highest GDP followed by Texas",
       subtitle = "GDP in million dollars per state (2020) ", 
       caption="Source: Bureau of Economic Analysis GDP Data 2020", 
       fill="GDP ($B)")
us_gdp_map

#GDP MAP OF US per CAPITA 
states_gdp_pc <- states %>%  left_join(gdp_perCapita_states_2020, by=c("NAME" = "geo_name")) %>% 
  #Filter out DC b/c it is not comparable (by far the highest GDP per capita)
  filter(NAME != "District of Columbia")

us_gdp_map_pc <- ggplot() + 
  geom_sf(data=states_gdp_pc, aes(fill= GDP_pc)) + 
  custom_layout + 
  scale_fill_gradient(low = "#CEE9F7", high = "#0032BF") +
  coord_sf(datum = NA)+
   labs(title="Washington DC, New York and Massachusetts have the highest GDP per capita",
       subtitle = "GDP per capita in dollars per state (2020) ", 
       caption="Source: Bureau of Economic Analysis GDP Data 2020, US Census 2019", 
       fill="GDP per capita ($)")
us_gdp_map_pc

Wine Consumption Analysis

Wine consumption growth

We also want to understand if wine is a good business to go into. Hence we first look at the wine consumption development over the years. We find that wine consumption has grown significantly over the last 15 years in the US.

library("readxl")

#Source: https://www.statista.com/statistics/233722/total-wine-consumption-of-the-us-by-wine-type/
#Wine Consumption in the U.S. by Wine Institute

wine_total_us <- read_excel( here::here("data","totalwineconsumption.xlsx"), sheet=2)

#Clean data and rename columns 
wine_total_us_clean <- wine_total_us[-1,]
wine_total_us_clean <- wine_total_us_clean[-1,] %>%  rename(year="Total wine consumption of the U.S. 2005-2020", consumption="...2") %>% mutate(consumption=as.numeric(consumption))

#Calculate consumption growth from first to last year
pct_growth= round((as.numeric(wine_total_us_clean[16,2])-as.numeric(wine_total_us_clean[1,2]))/as.numeric(wine_total_us_clean[1,2])*100,2)


#Create label for annotation
label <-paste("Total consumption grew by",as.character(pct_growth),"% \n over the last 15 years", by=" ")


wine_total_us_clean %>%  ggplot(aes(x=year, y=consumption)) + 
  geom_col(fill="#5C0303") + 
  custom_layout + 
  ylim(0, 1250)+
  labs(title="Wine consumption experienced high growth over the last 15 years",
       subtitle = "Total wine consumption in million gallons (2005-2020)", 
       caption="Source: Wine Institute - Wine consumption survery", 
       y="Consumption in million gallons") +
  
   #add arrow to graph
   geom_curve(
    data = data.frame(x = 1, y = 750, xend = 16, yend = 1050),
    mapping = aes(x = x, y = y, xend = xend, yend = yend),
    colour = "grey15",
    size = 0.5,
    curvature = -0.25,
    arrow = arrow(length = unit(2, "mm"), type = "closed"),
    inherit.aes = FALSE )  +

 # add the text label on the graph
  geom_text(
    data = data.frame(x = 6, y = 1200, label = label),
    aes(x = x, y = y, label = label),
    colour="grey15",
    family="PT Sans",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE)

Wine consumption by state

Similarly to the GDP analysis, we also want to understand the geographical distribution of wine consumption better. While California has the highest total consumption of wine, the New England region actually has the highest per capita consumption of wine.

library("readxl")

#Source: https://www.statista.com/statistics/942245/wine-consumption-in-the-us-by-state/
#National Institute on Alcohol Abuse and Alcoholism 

wine_by_state <- read_excel( here::here("data","wineconsumptionbystate.xlsx"), sheet=2)

#Clean data and rename columns 
wine_by_state_clean <- wine_by_state [-1,]
wine_by_state_clean <- wine_by_state_clean[-1,] %>%  rename(state="Wine consumption in the U.S. by state 2019", consumption="...2") %>% mutate(consumption=as.numeric(consumption))

wine_by_state_pop <- wine_by_state_clean %>%  left_join(state_pop, by=c("state"= "STATE")) %>%
  
#Consumption data reported in 1,000 of gallons 
#Caclulate per capita consumption 
  mutate(consumption_percapita= consumption/POPESTIMATE2019*1000)


wine_by_state_total_consumption <- wine_by_state_pop %>%  select(state, consumption)
wine_by_state_percapita_consumption <- wine_by_state_pop %>%  select(state, consumption_percapita)

states_wine_consumption <- states %>%  left_join(wine_by_state_total_consumption, by=c("NAME" = "state"))

states_wine_consumption_pc <- states %>%  left_join(wine_by_state_percapita_consumption, by=c("NAME" = "state"))

#Total wine consumption
us_wine_map <- ggplot() + 
  geom_sf(data=states_wine_consumption, aes(fill= consumption)) + 
  custom_layout + 
  scale_fill_gradient(low = "#FFDDDD", high = "#5C0303") +
  coord_sf(datum = NA)+
   labs(title="California has the highest total wine consumption",
       subtitle = "Wine consumption in 1,000 gallons", 
       caption="Source: National Institute on Alcohol Abuse and Alcoholism 2019", 
       fill="Total wine consumption")
us_wine_map

#Wine consumption per capita 
us_wine_map_pc <- ggplot() + 
  geom_sf(data=states_wine_consumption_pc, aes(fill= consumption_percapita)) + 
  custom_layout + 
  scale_fill_gradient(low = "#FFDDDD", high = "#5C0303") +
  coord_sf(datum = NA)+
   labs(title="New England states have the highest per capita wine consumption",
       subtitle = "Wine consumption per capita in gallons", 
       caption="Source: National Institute on Alcohol Abuse and Alcoholism 2019", 
       fill="Wine consumption per capita")
us_wine_map_pc

Retailer Data Analysis

We move on to analyse the data of an retailer to gain more information about what customers spend money on wine and how we could use that data to target specific customer groups.

Food basket analysis

We take a look at the entrie food basket of the customers of the retailer. We observe that wine actually has the highest proportion of spending, hence we can infer that the retailer is not a regular grocery chain but more likely a specilaized vendor known for its wine.

#COLOR WINE BOXPLOT
my_colours <- c("grey70", "#5C0303")

# Calculate total food basket and proportions of every category 
customer_food_basked <- profile_clean %>%  select(id,mnt_wines,mnt_fruits,mnt_meat_products,mnt_fish_products, mnt_sweet_products) %>% 
  mutate(food_basket=mnt_wines+mnt_fruits+mnt_meat_products+mnt_fish_products+mnt_sweet_products, 
         wine_prop=mnt_wines/food_basket, 
         fruit_prop=mnt_fruits/food_basket, 
         meat_prop=mnt_meat_products/food_basket, 
         fish_prop=mnt_fish_products/food_basket, 
         sweet_prop=mnt_sweet_products/food_basket)

#Plot proportion distribution 
customer_food_basked %>%
  select(id, wine_prop, fruit_prop, meat_prop, fish_prop, sweet_prop) %>%  
  pivot_longer(cols=2:6, names_to="type", values_to="proportion") %>%  
  mutate(type = fct_reorder(type, proportion, .fun='median')) %>% 
  mutate(wine=ifelse(type == "wine_prop", TRUE, FALSE)) %>% 
  ggplot(aes(x=reorder(type, desc(proportion)), y=proportion, group=type, color=wine)) +
  geom_boxplot() + 
  scale_x_discrete(labels= c("Wine", "Meat", "Fish", "Sweets", "Fruits")) +
  custom_layout + 
  scale_color_manual(values = my_colours)+
  theme(
    axis.title.y = element_blank(), 
    legend.position = "none" ) + 
  labs(title="Wine has highest proportion indicating that retailer is a delicatessen or specialized vendor",
       subtitle = "Proportion of food basket by food category", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Food category", 
       y= "Proportion of food basket")

Wine customer analysis

We know now that the customers of the retailer do appear to purchase a lot on wine, hence we want to use the data to gain more insights on what characteristics the high wine spenders have, which we could then ultimately use to target our customers. From the graphs below we can see that especially income and age appear to impact the median spending on wine consumption. This already gives us a first indication of what target customers we could focus on.

#We will use median as the wine spending data appears to be highly skewed right

my_colours <- c("grey70", "#5C0303")

#MARITAL STATUS
profile_clean %>% group_by(marital_status) %>%  
  summarise(median=median(mnt_wines), count=n()) %>%  
  filter(marital_status %in% c("Divorced", "Married", "Single","Together", "Widow")) %>% 
  mutate(wine=ifelse(marital_status == "Widow", TRUE, FALSE)) %>% 
  ggplot(aes(x=fct_reorder(marital_status, desc(median)), y=median, fill=wine)) + 
  geom_col() + 
  custom_layout +
  scale_fill_manual(values = my_colours)+
  theme(
    axis.title.y = element_blank(), 
    legend.position = "none" ) + 
  labs(title="Widowed people appear to spend much more on wine",
       subtitle = "Median wine spending by marital stuatus", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Marital status", 
       y= "Median wine spending")

#EDUCATION
profile_clean %>% group_by(education) %>%  
  summarise(median=median(mnt_wines), count=n()) %>% 
  filter(education %in% c("2n Cycle", "Graduation", "Master","PhD")) %>% 
  mutate(wine=ifelse(education == "PhD", TRUE, FALSE)) %>%
  ggplot(aes(x=education, y=median, fill=wine)) + 
  geom_col() + 
  custom_layout +
  scale_fill_manual(values = my_colours)+
  theme(
    axis.title.y = element_blank(), 
    legend.position = "none") + 
  labs(title="The higher the education, the more people spend on wine",
       subtitle = "Median wine spending by education level", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Education level", 
       y= "Median wine spending") 

#BY CHILDREN AT HOME 
profile_clean %>% group_by(kidhome) %>%  
  summarise(median=median(mnt_wines), count=n()) %>%  
  ggplot(aes(x=as.factor(kidhome), y=median)) + 
  geom_col() + 
  custom_layout +
  labs(title="Small children in the household decrease wine spending",
       subtitle = "Median wine spending by number of kids", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Number of kids at home", 
       y= "Median wine spending") 

#BY TEEN AT HOME 
profile_clean %>% group_by(teenhome) %>%  
  summarise(median=median(mnt_wines), count=n()) %>%  
  ggplot(aes(x=as.factor(teenhome), y=median)) + 
  geom_col() + 
  custom_layout +
  labs(title="Having teenagers in the house increases wine spending",
       subtitle = "Median wine spending by number of teenager", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Number of teenager at home", 
       y= "Median wine spending") 

#We will add column age and child for further analysis 
profile_clean['Age']= 2021-profile_clean$year_birth
profile_clean['Child']=profile_clean$kidhome+profile_clean$teenhome


#BY CHILDREN AT HOME (KIDS+TEENAGERS)
profile_clean %>% group_by(Child) %>%  
  summarise(median=median(mnt_wines), count=n()) %>%  
  ggplot(aes(x=as.factor(Child), y=median)) + 
  geom_col() + 
  custom_layout +
  labs(title="Customers without children at home spend more on wine",
       subtitle = "Median wine spending by number of children", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Number of children at home", 
       y= "Median wine spending") 

#BY AGE (only look at 30-70 because other categories are too small (not enough data))
profile_clean %>% group_by(Age) %>%  
  filter(Age>=30, Age <70) %>%
  mutate(age_group= case_when(
    Age %in% c(30,31,32,33,34) ~ "30-34", 
    Age %in% c(35,36,37,38,39) ~ "35-39",
    Age %in% c(40,41,42,43,44) ~ "40-44",
    Age %in% c(45,46,47,48,49) ~ "45-49",
    Age %in% c(50,51,52,53,54) ~ "50-54",
    Age %in% c(55,56,57,58,59) ~ "55-59",
    Age %in% c(60,61,62,63,64) ~ "60-64",
    Age %in% c(65,66,67,68,69) ~ "65-69",
  )) %>%  
  group_by(age_group) %>% 
  summarise(median=median(mnt_wines), count=n()) %>%  
  mutate(wine=ifelse(age_group %in% c("55-59", "60-64", "65-69"), TRUE, FALSE)) %>%
  ggplot(aes(x=age_group, y=median, fill=wine)) + 
  geom_col() + 
  custom_layout +
  scale_fill_manual(values = my_colours)+
  theme(
  axis.title.y = element_blank(), 
    legend.position = "none" ) + 
  labs(title="With age the wine expenditure appears to increase",
       subtitle = "Median wine spending by age group", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Age group", 
       y= "Median wine spending") 

## MEDIAN WINE SPENDING BY INCOME 

  profile_clean %>% group_by(income) %>%  
  mutate(income_group= case_when(
    income <= 30000 ~ "<30k",
    income <= 50000 ~ "30-50k",
    income <= 70000 ~ "50-70k", 
    income <= 90000 ~ "70-90k", 
    income > 90000 ~ ">90k")) %>%  
  group_by(income_group) %>% 
  summarise(median=median(mnt_wines), count=n()) %>%  
  mutate(income_group = fct_reorder(income_group, median, .fun='median')) %>% 
  mutate(wine=ifelse(income_group %in% c("70-90k", ">90k"), TRUE, FALSE)) %>%
  ggplot(aes(x=reorder(income_group, median), y=median, fill=wine)) + 
  geom_col() + 
  custom_layout +
  scale_fill_manual(values = my_colours)+
  theme(
  axis.title.y = element_blank(), 
    legend.position = "none" ) + 
  labs(title="High income groups tend to spend more on wine consumption",
       subtitle = "Median wine spending by income group", 
       caption="Source: Customer Personality Analysis Data Set", 
       x= "Annual income", 
       y= "Median wine spending") 

We observed that age appears to be an influencing factor for wine expenditure. However, we have not explored this variable before. To get a better understanding of the US in terms of median age, we look again at the map of the US by median age. It becomes apparent that the North Eastern region of the US appears to have a higher median age.

library("readxl")

#Source: https://www.statista.com/statistics/208048/median-age-of-population-in-the-usa-by-state/
#US Census Bureau 2019

median_age <- read_excel( here::here("data","medianageus.xlsx"), sheet=2)

#Clean data and rename columns 
median_age_clean <- median_age[-1,]
median_age_clean <- median_age_clean[-1,] %>% rename(state="Median age of U.S. population in  by state 2019", median_age="...2") %>% mutate(median_age=as.numeric(median_age))


median_age_pop <- median_age_clean  %>%  left_join(state_pop, by=c("state"= "STATE")) %>% 
  filter(state!="United States", state!="Puerto Rico")
  

median_age_pop_temp <- median_age_pop %>%  select(state, median_age)

states_median_age <- states %>%  left_join(median_age_pop_temp, by=c("NAME" = "state"))


#Median age by state
us_age_map <- ggplot() + 
  geom_sf(data=states_median_age, aes(fill= median_age)) + 
  custom_layout + 
  scale_fill_gradient(low = "#DEFFDA", high = "#064500") +
  coord_sf(datum = NA)+
   labs(title="Northeast has the highest median age",
       subtitle = "Median age by state", 
       caption="Source: US Census Bureau 2019", 
       fill="Median age")
us_age_map 

Correlation Analysis

And then we would like to see how these features are correlated with each other

library("readr")
profiles_cleaned <- read_csv(here::here("profiles_cleaned.csv"))
## New names:
## * `` -> ...1
## Rows: 440 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): education, marital_status, dt_customer
## dbl (27): ...1, id, year_birth, income, kidhome, teenhome, recency, mnt_wine...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
profiles_cleaned %>% 
  mutate(age = 2021-year_birth) %>% 
  select(age, income, kidhome, mnt_wines, mnt_fish_products, mnt_meat_products) %>% 
  ggcorr(label_round=2,label=TRUE,hjust=0.75 )

Customer cluster analysis

Next, we continue with a cluster analysis of the customers to actually learn more about the characteristics that influences their sending.

#removing NAs
df1=na.omit(profile_clean)
#calculate age
df1['age']= 2021-df1$year_birth

#number of promotions accepted
df1['accepted']=df1$accepted_cmp1+df1$accepted_cmp2+df1$accepted_cmp3+df1$accepted_cmp4+df1$accepted_cmp5
head(df1)
idyear_birtheducationmarital_statusincomekidhometeenhomedt_customerrecencymnt_winesmnt_fruitsmnt_meat_productsmnt_fish_productsmnt_sweet_productsmnt_gold_prodsnum_deals_purchasesnum_web_purchasesnum_catalog_purchasesnum_store_purchasesnum_web_visits_monthaccepted_cmp3accepted_cmp4accepted_cmp5accepted_cmp1accepted_cmp2complainz_cost_contactz_revenueresponseAgeChildageaccepted
5.52e+031.96e+03GraduationSingle5.81e+040004-09-2012586358854617288883810470000003111640640
2.17e+031.95e+03GraduationSingle4.63e+041108-03-2014381116216211250000003110672670
4.14e+031.96e+03GraduationTogether7.16e+040021-08-2013264264912711121421821040000003110560560
6.18e+031.98e+03GraduationTogether2.66e+041010-02-201426114201035220460000003110371370
5.32e+031.98e+03PhDMarried5.83e+041019-01-20149417343118462715553650000003110401400
7.45e+031.97e+03MasterTogether6.25e+040109-09-2013165204298042142641060000003110541540
#picking only the relevant variables for our analysis
df1<-df1 %>% 
  select(age, accepted, marital_status, income, education, teenhome, kidhome, mnt_wines) %>% 
  mutate(graduated=case_when(
    #we are only interested whether a person is highly educated or not since that has an influence on wine consumption
    education %in% c("Graduation", "PhD", "Master")~1,
    TRUE~0
    ),
    widow=case_when(
      #the only marital status of interest is widow since other don't affect wine consumption much
      marital_status=="Widow"~1,
      TRUE~0
    ))
#drop string variables
df1<-df1 %>% 
  select(-education, -marital_status)

We transform the data and determining the optimal K value

#scale the data 
df1 <- data.frame(scale(df1))

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#determining number of clusters - elbow method
fviz_nbclust(df1,kmeans,method="wss")+geom_vline(xintercept=2,linetype=2) + custom_layout

We can look at clustered data aand see that there are two main clusters we can group the customers by.

set.seed(123)

#perform kmeans clustering for k=2
model_kmeans_2clusters <- eclust(df1, "kmeans", k = 2, nstart=50, graph=FALSE)

#run pca and plot cluster division for 2 consumer clusters
fviz_cluster(model_kmeans_2clusters, dfc, geom = "point",ellipse.type = "norm",repel = TRUE, ggtheme = theme_minimal(), xlab="", ylab="")+
  ggtitle(label="We identified 2 main consumer clusters")+
  scale_color_manual(values=c("tomato", "cornflowerblue"))+
  theme(plot.title = element_text(face = "bold")) + custom_layout

#add clusters to the data frame
dfc_withClusters <- mutate(df1, cluster=as.factor(model_kmeans_2clusters$cluster))
cluster_centers <- data.frame(cluster=as.factor(c(1:2)),model_kmeans_2clusters$centers)

#transpose
cluster_centers_t <- cluster_centers %>% gather(variable, value, -cluster, factor_key=TRUE)

Visualizing cluster centers

The cluster centers actually help us to determine the characteristics that differentiation the two customer groups.

#plot the cluster centers
graphkmeans_2clusters <- ggplot(cluster_centers_t, aes(x=variable, y=value))+
  geom_line(aes(color=cluster, group=cluster), linetype="dashed", size=1)+
  geom_point(size=1, shape=4)+
  geom_hline(yintercept=0)+
  custom_layout+
  theme(text = element_text(size=10),
        axis.text.x = element_text(angle=45, hjust=1),
        plot.title = element_text(face = "bold"))+
  ggtitle("Consumers are separated on income, age and amout spent on wine", subtitle="Separation of 2 clusters based on different consumer features") +
  scale_color_manual(values=c("tomato", "cornflowerblue"))+
  labs(y="", x="")
graphkmeans_2clusters